# Loading package
library(ChromENVEE)
Generated vector to associated each chromatin state number
state_numer to a name state_name and a color
color_value (usable for plot generation)
state_number = c("U1","U2","U3","U4","U5","U6","U7","U8","U9","U10","U11","U12","U13","U14","U15","U16","U17","U18")
state_name = c("TSSA","TSSFlnk","TSSFlnkD","Tx","TxWk","EnhG","EnhG","EnhA","EnhWk","ZNF/Rpts","Het","TssBiv","EnhBiv","ReprPC","ReprPCWk","Quies","Quies","Quies")
color_value = c("#B71C1C","#E65100","#E65100","#43A047","#1B5E20","#99FF66","#99FF66","#F5B041","#FFEB3B","#48C9B0","#B39DDB","#880E4F","#666633","#424949","#7B7D7D","#D0D3D4","#D0D3D4","#D0D3D4")
Download the reference genome as a .bed file, in this
study we use mouse genome (mm10).
data(genome_file)
#> chr start end strand score gene_ENS
#> 1 chr1 3073253 3074322 + . ENSMUSG00000102693.1
#> 2 chr1 3102016 3102125 + . ENSMUSG00000064842.1
#> 3 chr1 3205901 3671498 - . ENSMUSG00000051951.5
#> 4 chr1 3252757 3253236 + . ENSMUSG00000102851.1
#> 5 chr1 3365731 3368549 - . ENSMUSG00000103377.1
#> 6 chr1 3375556 3377788 - . ENSMUSG00000104017.1
Download the chromatin state file (output of ChromHMM). It’s a
.bed file which contains informations like genomic
position, information about chromatin state.
data(chromatin_state)
#> chr start end state name state_name
#> 1 chr10 0 3100000 U16 RS Quies
#> 2 chr10 3100000 3109200 U11 RS Het
#> 3 chr10 3109200 3110600 U12 RS TssBiv
#> 4 chr10 3110600 3111000 U14 RS ReprPC
#> 5 chr10 3111000 3111200 U13 RS EnhBiv
#> 6 chr10 3111200 3117200 U12 RS TssBiv
We are interested to know the distribution of chromatin state in the
genome. plotChromatinState calculates the percentage of
each chromatin state in function the length of the genome used. We
obtains a data frame with the percentage of coverage for each chromatin
state and it’s possible to plot the result in .png
file.
summary_chromatin_state = plotChromatinState(chromatin_state, state_name = state_name, state_number = state_number, merge = TRUE, plot = FALSE, color = color_value, filename = "")
head(summary_chromatin_state)
#> state coverage sample_name
#> TSSA TSSA 0.08519426 RS
#> TSSFlnk TSSFlnk 0.45530134 RS
#> TSSFlnkD TSSFlnkD 1.18900667 RS
#> Tx Tx 2.60257103 RS
#> TxWk TxWk 2.44911129 RS
#> EnhG EnhG 7.10081351 RS
We are interested to associated at each enhancer, genes regulated by
the enhancer. We focused on enhancer chromatin state (in this study, we
have 4 tpe of enhancer : bivalent enhancer (EnhBiv), genic enhancer
(EnhG), active enhancer (EnhA) and weak enhancer (EnhWk)). The file used
must to be a GRanges file
data(list_table_enhancer)
#> GRanges object with 1979 ranges and 1 metadata column:
#> seqnames ranges strand | chromatin_state
#> <Rle> <IRanges> <Rle> | <character>
#> [1] chr10 9164400-9164800 * | U13
#> [2] chr10 9342200-9344000 * | U13
#> [3] chr10 10476400-10476600 * | U13
#> [4] chr10 20520200-20521000 * | U13
#> [5] chr10 20952400-20952600 * | U13
#> ... ... ... ... . ...
#> [1975] chrX 144286800-144287000 * | U13
#> [1976] chrX 155128400-155129200 * | U13
#> [1977] chrX 170010800-170013800 * | U13
#> [1978] chrY 198400-198800 * | U13
#> [1979] chrY 90786000-90788000 * | U13
#> -------
#> seqinfo: 21 sequences from an unspecified genome; no seqlengths
To estimated which gene is regulated by enhancer, we estimated that
genes associated enhancer, all TSS genes in interval around enhancer.
enhancerAnnotation() take few minutes to process in
function the length of your enhancer table. It’s possible to multithread
the job with the nCore parameter. For each enhancer
position, we obtains two informations, the distnace between gene and
enhancer (in bp) and the gene
table_enhancer_gene = enhancerAnnotation(list_table_enhancer[[1]],genome = genome_file,interval = 500000, nCore = 1)
#> GRanges object with 1979 ranges and 6 metadata columns:
#> seqnames ranges strand | chromatin_state start_500kb end_500kb
#> <Rle> <IRanges> <Rle> | <character> <numeric> <numeric>
#> 1 chr10 9164400-9164800 * | U13 8664400 9664800
#> 1 chr10 9342200-9344000 * | U13 8842200 9844000
#> 1 chr10 10476400-10476600 * | U13 9976400 10976600
#> 1 chr10 20520200-20521000 * | U13 20020200 21021000
#> 1 chr10 20952400-20952600 * | U13 20452400 21452600
#> . ... ... ... . ... ... ...
#> 1 chrX 144286800-144287000 * | U13 143786800 144787000
#> 1 chrX 155128400-155129200 * | U13 154628400 155629200
#> 1 chrX 170010800-170013800 * | U13 169510800 170513800
#> 1 chrY 198400-198800 * | U13 -301600 698800
#> 1 chrY 90786000-90788000 * | U13 90286000 91288000
#> gene_association distance gene_list
#> <integer> <character> <character>
#> 1 19 451159;278330;340253.. ENSMUSG00000111215.1..
#> 1 21 456130;480757;457563.. ENSMUSG00000015305.6..
#> 1 20 499773;435480;392457.. ENSMUSG00000101621.2..
#> 1 16 371729;318362;311710.. ENSMUSG00000019996.1..
#> 1 21 227322;432632;326765.. ENSMUSG00000019990.1..
#> . ... ... ...
#> 1 17 459386;459456;353489.. ENSMUSG00000067276.5..
#> 1 29 492849;417142;353947.. ENSMUSG00000084367.3..
#> 1 7 325601;314901;231165.. ENSMUSG00000035299.1..
#> 1 3 7351;124495;496977 ENSMUSG00000101796.1..
#> 1 9 384752;254355;180136.. ENSMUSG00000096178.7..
#> -------
#> seqinfo: 21 sequences from an unspecified genome; no seqlengths
We want to know the disribution of number of genes associated at each enhancer to adapted future filters
plotGeneAssociation(table_enhancer_gene, all = F)
We associated the expression level at each gene associated to enhancer
data(geneExpression)
#> gene_ENS chr start end strand score gene_expression
#> 1 ENSMUSG00000000001.4 chr3 108107280 108146146 - . 27.7106904
#> 2 ENSMUSG00000000028.15 chr16 18780447 18811987 - . 23.5842993
#> 3 ENSMUSG00000000031.16 chr7 142575529 142578143 - . 0.9386427
#> 4 ENSMUSG00000000037.16 chrX 161117193 161258213 + . 14.4548991
#> 5 ENSMUSG00000000049.11 chr11 108343354 108414396 + . 36.6169129
#> 6 ENSMUSG00000000056.7 chr11 121237253 121255856 + . 5.2791187
table_enhancer_gene_expression = enhancerExpression(table_enhancer_gene, gene_expression_table = geneExpression)
#> GRanges object with 1979 ranges and 8 metadata columns:
#> seqnames ranges strand | chromatin_state start_500kb end_500kb
#> <Rle> <IRanges> <Rle> | <character> <numeric> <numeric>
#> 1 chr10 9164400-9164800 * | U13 8664400 9664800
#> 1 chr10 9342200-9344000 * | U13 8842200 9844000
#> 1 chr10 10476400-10476600 * | U13 9976400 10976600
#> 1 chr10 20520200-20521000 * | U13 20020200 21021000
#> 1 chr10 20952400-20952600 * | U13 20452400 21452600
#> . ... ... ... . ... ... ...
#> 1 chrX 144286800-144287000 * | U13 143786800 144787000
#> 1 chrX 155128400-155129200 * | U13 154628400 155629200
#> 1 chrX 170010800-170013800 * | U13 169510800 170513800
#> 1 chrY 198400-198800 * | U13 -301600 698800
#> 1 chrY 90786000-90788000 * | U13 90286000 91288000
#> gene_association distance gene_list sample_name
#> <integer> <character> <character> <character>
#> 1 19 451159;278330;340253.. ENSMUSG00000111215.1.. RS
#> 1 21 456130;480757;457563.. ENSMUSG00000015305.6.. RS
#> 1 20 499773;435480;392457.. ENSMUSG00000101621.2.. RS
#> 1 16 371729;318362;311710.. ENSMUSG00000019996.1.. RS
#> 1 21 227322;432632;326765.. ENSMUSG00000019990.1.. RS
#> . ... ... ... ...
#> 1 17 459386;459456;353489.. ENSMUSG00000067276.5.. RS
#> 1 29 492849;417142;353947.. ENSMUSG00000084367.3.. RS
#> 1 7 325601;314901;231165.. ENSMUSG00000035299.1.. RS
#> 1 3 7351;124495;496977 ENSMUSG00000101796.1.. RS
#> 1 9 384752;254355;180136.. ENSMUSG00000096178.7.. RS
#> gene_expression
#> <character>
#> 1 NA;12.8456863815602;..
#> 1 12.8456863815602;2.0..
#> 1 NA;NA;NA;NA;NA;NA;NA..
#> 1 102.374504394998;2.0..
#> 1 0.571438637996035;3...
#> . ...
#> 1 0.255147894054393;NA..
#> 1 NA;NA;NA;12.00904498..
#> 1 0.161867853849147;NA..
#> 1 NA;NA;NA
#> 1 2.25309950916964;0.7..
#> -------
#> seqinfo: 21 sequences from an unspecified genome; no seqlengths
plotDistanceExpression(table_enhancer_gene_expression,color = color_value,
state_name = state_name, state_number = state_number)
plotGeneDistance(table_enhancer_gene_expression)
plotEnhancerExpression(table_enhancer_gene_expression, scale = "log10",
color = color_value, state_name = state_name, state_number = state_number)
# plotGeneAssociation(list_table_enhancer_gene_sample,all = T)
# plotDistanceExpression(list_table_enhancer_gene_expression, color = color_value, state_name = state_name, state_number = state_number)
# plotGeneDistance(list_table_enhancer_gene_expression)
# plotEnhancerExpression(list_table_enhancer_gene_expression, scale = "log10", color = color_value, state_name = state_name, state_number = state_number)
state_order_reduce = c("TSSA","TSSFlnk","TSSFlnk","Tx","Tx","EnhG","EnhG","EnhA","EnhWk","ZNF.Rpts","Het","TssBiv","EnhBiv","ReprPC","ReprPC","Quies","Quies","Quies")
data(geneExpression)
data(chromatin_state)
table_overlapping = geneEnvironment(geneExpression,chromatin_state, unique(state_order_reduce))
rownames(table_overlapping) = table_overlapping$gene_ENS
#> gene_ENS chr start end strand
#> ENSMUSG00000000001.4 ENSMUSG00000000001.4 chr3 108107280 108146146 -
#> ENSMUSG00000000028.15 ENSMUSG00000000028.15 chr16 18780447 18811987 -
#> ENSMUSG00000000031.16 ENSMUSG00000000031.16 chr7 142575529 142578143 -
#> ENSMUSG00000000037.16 ENSMUSG00000000037.16 chrX 161117193 161258213 +
#> ENSMUSG00000000049.11 ENSMUSG00000000049.11 chr11 108343354 108414396 +
#> ENSMUSG00000000056.7 ENSMUSG00000000056.7 chr11 121237253 121255856 +
#> score gene_expression TSS TSS_moins_3kb
#> ENSMUSG00000000001.4 . 27.7106904 108146146 108143146
#> ENSMUSG00000000028.15 . 23.5842993 18811987 18808987
#> ENSMUSG00000000031.16 . 0.9386427 142578143 142575143
#> ENSMUSG00000000037.16 . 14.4548991 161117193 161114193
#> ENSMUSG00000000049.11 . 36.6169129 108343354 108340354
#> ENSMUSG00000000056.7 . 5.2791187 121237253 121234253
#> TSS_plus_3kb TSSA TSSFlnk Tx EnhG EnhA
#> ENSMUSG00000000001.4 108149146 0.00000000 0.00000000 0 0.7423333 0
#> ENSMUSG00000000028.15 18814987 0.00000000 0.06666667 0 0.6333333 0
#> ENSMUSG00000000031.16 142581143 0.00000000 0.00000000 0 0.0000000 0
#> ENSMUSG00000000037.16 161120193 0.03333333 0.40000000 0 0.0000000 0
#> ENSMUSG00000000049.11 108346354 0.00000000 0.00000000 0 0.0000000 0
#> ENSMUSG00000000056.7 121240253 0.00000000 0.06666667 0 0.6000000 0
#> EnhWk ZNF.Rpts Het TssBiv EnhBiv ReprPC Quies
#> ENSMUSG00000000001.4 0.0000 0 0 0.2576667 0.0 0.0000 0.0000000
#> ENSMUSG00000000028.15 0.0000 0 0 0.3000000 0.0 0.0000 0.0000000
#> ENSMUSG00000000031.16 0.0000 0 0 0.0000000 0.4 0.3095 0.2905000
#> ENSMUSG00000000037.16 0.1655 0 0 0.0000000 0.0 0.0000 0.4011667
#> ENSMUSG00000000049.11 0.0000 0 0 0.3000000 0.3 0.3410 0.0590000
#> ENSMUSG00000000056.7 0.0000 0 0 0.3333333 0.0 0.0000 0.0000000
result_umap = predominentState(table_overlapping, state = unique(state_order_reduce),
header = unique(state_order_reduce) ,neighbors = 32, metric = "euclidean", dist = 0.5)
#> It will be take few minutes to process
#> [1] "umap-learn already installed"
#> TSSA TSSFlnk Tx EnhG EnhA EnhWk ZNF.Rpts
#> ENSMUSG00000000001.4 0.00000000 0.00000000 0 0.7423333 0 0.0000 0
#> ENSMUSG00000000028.15 0.00000000 0.06666667 0 0.6333333 0 0.0000 0
#> ENSMUSG00000000031.16 0.00000000 0.00000000 0 0.0000000 0 0.0000 0
#> ENSMUSG00000000037.16 0.03333333 0.40000000 0 0.0000000 0 0.1655 0
#> ENSMUSG00000000049.11 0.00000000 0.00000000 0 0.0000000 0 0.0000 0
#> ENSMUSG00000000056.7 0.00000000 0.06666667 0 0.6000000 0 0.0000 0
#> Het TssBiv EnhBiv ReprPC Quies UMAP1
#> ENSMUSG00000000001.4 0 0.2576667 0.0 0.0000 0.0000000 9.531457
#> ENSMUSG00000000028.15 0 0.3000000 0.0 0.0000 0.0000000 -11.343992
#> ENSMUSG00000000031.16 0 0.0000000 0.4 0.3095 0.2905000 8.796690
#> ENSMUSG00000000037.16 0 0.0000000 0.0 0.0000 0.4011667 2.249324
#> ENSMUSG00000000049.11 0 0.3000000 0.3 0.3410 0.0590000 7.513260
#> ENSMUSG00000000056.7 0 0.3333333 0.0 0.0000 0.0000000 -11.085731
#> UMAP2 state
#> ENSMUSG00000000001.4 -15.5334320 EnhG
#> ENSMUSG00000000028.15 0.2111701 EnhG
#> ENSMUSG00000000031.16 -0.7768648 EnhBiv
#> ENSMUSG00000000037.16 -6.8519120 Quies
#> ENSMUSG00000000049.11 2.5025253 ReprPC
#> ENSMUSG00000000056.7 -1.0832608 EnhG
Here is the output of sessionInfo() on the system on
which this document was compiled:
#> R version 4.1.3 (2022-03-10)
#> Platform: x86_64-conda-linux-gnu (64-bit)
#> Running under: Ubuntu 18.04.6 LTS
#>
#> Matrix products: default
#> BLAS/LAPACK: /home/mcoulee/anaconda3/envs/R_package_3/lib/libopenblasp-r0.3.20.so
#>
#> locale:
#> [1] LC_CTYPE=fr_FR.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=fr_FR.UTF-8 LC_COLLATE=fr_FR.UTF-8
#> [5] LC_MONETARY=fr_FR.UTF-8 LC_MESSAGES=fr_FR.UTF-8
#> [7] LC_PAPER=fr_FR.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=fr_FR.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] ChromENVEE_1.1.5
#>
#> loaded via a namespace (and not attached):
#> [1] sass_0.4.2 pkgload_1.3.0 jsonlite_1.8.0
#> [4] splines_4.1.3 here_1.0.1 bslib_0.4.0
#> [7] assertthat_0.2.1 highr_0.9 stats4_4.1.3
#> [10] umapr_0.0.0.9001 GenomeInfoDbData_1.2.7 yaml_2.3.5
#> [13] remotes_2.4.2 sessioninfo_1.2.2 pillar_1.8.0
#> [16] lattice_0.20-45 glue_1.6.2 reticulate_1.25
#> [19] digest_0.6.29 GenomicRanges_1.46.1 XVector_0.34.0
#> [22] colorspace_2.0-3 htmltools_0.5.3 Matrix_1.4-1
#> [25] pkgconfig_2.0.3 devtools_2.4.3 zlibbioc_1.40.0
#> [28] purrr_0.3.4 scales_1.2.0 processx_3.7.0
#> [31] tibble_3.1.7 mgcv_1.8-40 generics_0.1.3
#> [34] farver_2.1.1 IRanges_2.28.0 ggplot2_3.3.6
#> [37] usethis_2.1.6 ellipsis_0.3.2 cachem_1.0.6
#> [40] withr_2.5.0 BiocGenerics_0.40.0 cli_3.3.0
#> [43] magrittr_2.0.3 crayon_1.5.1 memoise_2.0.1
#> [46] evaluate_0.15 ps_1.7.1 fs_1.5.2
#> [49] fansi_1.0.3 nlme_3.1-158 pkgbuild_1.3.1
#> [52] tools_4.1.3 prettyunits_1.1.1 lifecycle_1.0.1
#> [55] stringr_1.4.0 S4Vectors_0.32.3 munsell_0.5.0
#> [58] callr_3.7.1 compiler_4.1.3 jquerylib_0.1.4
#> [61] GenomeInfoDb_1.30.0 rlang_1.0.4 grid_4.1.3
#> [64] RCurl_1.98-1.7 rappdirs_0.3.3 bitops_1.0-7
#> [67] labeling_0.4.2 rmarkdown_2.14 gtable_0.3.0
#> [70] DBI_1.1.3 curl_4.3.2 R6_2.5.1
#> [73] knitr_1.39 dplyr_1.0.9 fastmap_1.1.0
#> [76] utf8_1.2.2 rprojroot_2.0.3 stringi_1.7.8
#> [79] parallel_4.1.3 Rcpp_1.0.9 vctrs_0.4.1
#> [82] png_0.1-7 tidyselect_1.1.2 xfun_0.31
ChromHMM Papier scientifique associé umapr genomicRanges